library(PerformanceAnalytics)
library(MASS)
library(tm) 
library(tidyr)
library(plyr)
library(dplyr)
library(tidyverse)
library(caret)
library(class)
library(e1071)
library(data.table)
library(gganimate)
library(GGally)
require(ggthemes)
library(rgr)  
library(ggpubr)
library(srvyr)
library(mltools)
library(data.table)
library(ggcorrplot)
# Load the case-study data (absolute path; adjust for your machine).
data <- read.csv("/Users/dgrijalva/SMU/Classes/Term Fall 2020/Doing-Data-Science/Units/project2/Data/CaseStudy2-data.csv")

# Add a 0/1 factor copy of Attrition ("Yes" -> "1", "No" -> "0") for modeling.
# Inside mutate() the bare column name is used rather than data$Attrition.
data <- data %>% mutate(Attrition_binary = revalue(factor(Attrition),
                                                  c("Yes" = "1", "No" = "0")))

# Class-balance check: counts of Attrition Yes vs No.
data %>% ggplot(aes(x = Attrition, fill = Attrition)) +
  geom_bar(stat = "count") +
  ggtitle("Attrition Yes vs No") +
  theme_clean()

There is a heavy class imbalance: roughly a 5:1 ratio favoring employees with no attrition. This might make it difficult to build an attrition prediction model that generalizes well.

#Functions
We will start by creating functions to graph each feature appropriately and dynamically. Three functions will be created:
1. This function will graph a histogram of the overall data, and data filtered by attrition yes and no.
2. This function will graph a bar chart of the percentage of people who left the company by feature type and value.
3. This function will graph a density plot of the attrition vs no attrition distribution per feature.

feature_graphs = function (data, input_stat, feature) {
  # Builds one histogram panel for the supplied rows. `mapping` is passed in
  # so the overall panel (plain aes) and the attrition panels (aes with a
  # fill constant) keep exactly their original aesthetics.
  panel = function(rows, mapping, panel_title) {
    rows %>%
      select(feature) %>%
      gather() %>%
      ggplot(mapping) +
      geom_histogram(fill = "steelblue", bins = 10, stat = input_stat) +
      ggtitle(panel_title) +
      theme_clean() +
      theme(axis.text.x = element_text(angle = 60, hjust = 1, size = 6)) +
      xlab(feature)
  }

  # Same feature three ways: everyone, leavers only, stayers only.
  overall_panel = panel(data, aes(value), "Overall Data")
  leavers_panel = panel(data %>% filter(Attrition == "Yes"),
                        aes(value, fill = "steelblue"), "Attrition == Yes")
  stayers_panel = panel(data %>% filter(Attrition == "No"),
                        aes(value, fill = "steelblue"), "Attrition == No")

  # Arrange the three panels side by side and return the combined figure.
  ggarrange(overall_panel, leavers_panel, stayers_panel, ncol = 3)
}


# Plot the attrition percentage for every distinct value of one feature.
#
# data:         data.frame containing an Attrition column ("Yes"/"No").
# select_field: column name (string) to break the attrition rate down by.
#
# Prints (and invisibly returns) a bar chart of attrition percentage per
# feature value. The original grew a data.frame with rbind() inside the
# loop (O(n^2) copying); rows are now collected in a preallocated list.
att_ptc = function (data, select_field){
  vals = unlist(unique(data[select_field]))
  parts = vector("list", length(vals))
  idx = 0
  for (i in vals) {
    idx = idx + 1
    # One logical mask replaces the original pair of dplyr filters;
    # na.rm matches filter()'s NA-dropping behavior.
    in_group = data[[select_field]] == i
    ratio = sum(data$Attrition == "Yes" & in_group, na.rm = TRUE) /
      sum(in_group, na.rm = TRUE)
    parts[[idx]] = data.frame(Field = i, Attrition_ptc = ratio)
  }
  df = do.call(rbind, parts)

  title = paste("Attrition Percentage By ", select_field)
  figure = df %>% ggplot() +
    geom_bar(aes(y = Attrition_ptc, x = Field), fill = "steelblue", stat = "identity") +
    ylab("Attrition Percentage") + xlab(select_field) +
    ggtitle(paste0(title)) + theme_clean() +
    theme(axis.text.x = element_text(angle = 60, hjust = 1, size = 6))
  print(figure)
}


# Density plot of one feature, split by Attrition ("Yes" vs "No").
#
# data:    data.frame with an Attrition column plus the requested feature.
# feature: column name given as a string.
#
# Returns the ggplot object (caller decides when to print).
# Changes vs original: the deprecated rlang UQ(as.name(...)) idiom is
# replaced with the .data pronoun, the external string selection is made
# explicit with all_of(), and the "Dentisty" title typo is fixed.
density_plot = function(data, feature){
  title = paste("Density Plot: ", feature)
  figure = data %>%
    select(Attrition, all_of(feature)) %>%
    ggplot(aes(x = .data[[feature]], fill = Attrition)) +
    geom_density(alpha = 0.7) +
    scale_fill_manual(values = c("#386cb0", "#fdb462")) +
    theme_clean() +
    ggtitle(paste0(title))
  return(figure)
}

Numerical Feature Distributions

Let’s take a look at the numeric (int) feature distribution for the overall data, attrition yes and no.
This will give us a visual representation of the differences between attrition yes and no.

num_variables = c("Age", "DailyRate", "DistanceFromHome", "HourlyRate", "NumCompaniesWorked", "PercentSalaryHike", "YearsAtCompany", "YearsInCurrentRole", "YearsSinceLastPromotion", "YearsWithCurrManager","TotalWorkingYears", "TrainingTimesLastYear", "MonthlyIncome", "MonthlyRate", "PercentSalaryHike")


# Produce all three diagnostic views for every numeric feature:
# histograms (overall / yes / no), density by attrition, and attrition rate.
for (feature_name in num_variables) {
  print(feature_graphs(data, "bin", feature_name))
  print(density_plot(data, feature_name))
  att_ptc(data, feature_name)
}

## Observations
- Many of the numerical features are right skewed.
- The age feature for employees with attrition seems to be heavy tailed towards right compared to employees who haven’t left the company.
- Employees in their 20’s are those with the highest attrition percentage.
- Lower monthly income seems to be more prominent among employees that left the company.

Categorical (nominal) Feature Distributions

Let’s take a look at the categorical (fctr) feature distribution for the overall data, attrition yes and no.
This will give us a visual representation of the differences.

# Nominal (unordered) categorical features.
cat_nominal_variables = c("BusinessTravel", "Department", "EducationField",
                          "Gender", "JobRole", "MaritalStatus", "OverTime")

# Count-based histograms and attrition-percentage bars per nominal feature.
for (feature_name in cat_nominal_variables) {
  print(feature_graphs(data, "count", feature_name))
  att_ptc(data, feature_name)
}

- There seems to be higher attrition on the following education fields: human resources and technical degrees.
- Attrition is more common among males.
- Attrition is more common among employees who worked overtime.
- Attrition is more common among single employees.
- Attrition is more common among employees who travel frequently.

Categorical (ordinal) Feature Distributions

Let’s take a look at the categorical (int) feature distribution for the overall data, attrition yes and no.
This will give us a visual representation of the differences.

# Ordinal categorical features (despite the "nominal" in the variable name,
# kept because downstream chunks reference this name).
cat_nominal_features_overall = c("JobSatisfaction", "EnvironmentSatisfaction",
                                 "JobLevel", "WorkLifeBalance", "JobInvolvement",
                                 "PerformanceRating", "Education",
                                 "RelationshipSatisfaction", "StockOptionLevel")

# Count histograms, density plots, and attrition rates per ordinal feature.
for (feature_name in cat_nominal_features_overall) {
  print(feature_graphs(data, "count", feature_name))
  print(density_plot(data, feature_name))
  att_ptc(data, feature_name)
}

What are the main factors leading to attrition?

From looking at the numerical and categorical (nominal and ordinal) we can conclude that the main factors leading to attrition are the following:
- Employees who travel frequently show a higher attrition rate compared to employees who travel rarely or don’t travel at all.
- The job title with the higher attrition percentage is sales representative.
- Employees with a higher salary are less likely to result in leaving the company.
- According to the data, single employees are more likely to result in attrition.
- Employees who have previously worked in several companies show a higher percentage of attrition.
- Employees that work overtime are more likely to result in attrition.
- Employees in their 20’s are more likely to leave the company.

Model Building

Now let’s create two predictive models using the features provided in the dataset.
1. We will create a KNN model to predict attrition
2. We will create a multiple-linear regression model to predict monthly income

The first thing we will do is encode the categorical values. This will allow us to use these features in a KNN model.

# One-hot encode the nominal text features so KNN can consume them.
# all_of() makes the external-vector selection explicit (tidyselect now
# warns on a bare external vector — see the message later in this report).
nominal_categories = data %>% select(all_of(cat_nominal_variables))
one_encode_data = one_hot(as.data.table(nominal_categories))

Now, we need to create a train and test data set. We will use a 70-30 percent split.

# Assemble the modeling frame: numeric + ordinal features, one-hot nominals,
# and the 0/1 class label `cl` as the final column.
predict_data = data %>% select(all_of(num_variables), all_of(cat_nominal_features_overall))
cl = data$Attrition_binary
predict_data = cbind(predict_data, one_encode_data)
predict_data = cbind(predict_data, cl)

set.seed(20)
splitPerc = .70

# 70/30 train/test split by sampling row indices.
# nrow() replaces the less idiomatic dim(x)[1]; same RNG draws either way.
trainIndices = sample(seq_len(nrow(predict_data)), round(splitPerc * nrow(predict_data)))
train = predict_data[trainIndices, ]
test = predict_data[-trainIndices, ]

Now that we have a train and test set, we will proceed to preprocess the data. The only preprocessing we will perform is scaling the features. We have several features, each on a different scale, which might hurt the predictive model’s performance. For scaling, we will use the “scale” method fitted on the train data only; this prevents data leakage.

# Transform the data into the same scale
# Fit the scaler on TRAIN only (prevents leakage), then apply it to both
# sets. Columns 1:51 are the features; column 52 is the class label `cl`.
pp = preProcess( train[,1:51], method = c("scale"))
scaled_train = predict(pp, train[,1:51])
scaled_test = predict(pp, test[,1:51])

dim(scaled_train)
## [1] 609  51
dim(scaled_test)
## [1] 261  51
# Test labels as a plain vector.
# NOTE(review): cla_test is never used downstream (test$cl is used directly).
cla_test = as.vector(test$cl)

KNN - Predicting attrition

Now that both the train and test sets have been preprocessed, we will build the KNN model.
The first step will be to choose the K that provides the best sensitivity and specificity on the test data. We will iterate the model training using K’s ranging from 1 to 50.
We will use the best K to build our final model.
For this model, we will have a success metric of achieving a minimum of 60% on both sensitivity and specificity.

#library("crossval")
#Choose the best K
set.seed(123)

# Run iterations to find the best K (k = 1..50), recording test-set
# accuracy, sensitivity, and specificity for each value.
iterations = 50
# All four result columns are preallocated; the original grew the
# sensitivity/specificity columns inside the loop.
accs = data.frame(accuracy = numeric(iterations),
                  k = numeric(iterations),
                  sensitivity = numeric(iterations),
                  specificity = numeric(iterations))

for (i in seq_len(iterations))
{
  classification = knn(scaled_train, scaled_test, train$cl, k = i)
  # positive = "1" makes "attrition" the positive class for the metrics.
  cm = confusionMatrix(table(classification, test$cl), positive = "1")

  accs$accuracy[i] = cm$overall[1]
  accs$k[i] = i
  accs$sensitivity[i] = cm$byClass[1]
  accs$specificity[i] = cm$byClass[2]
}

# Plot accuracy, sensitivity, and specificity across the K values.
# (Label typo "Sensisivity" fixed; the blue/orange/red color keys sort
# alphabetically, so the label order Accuracy/Specificity/Sensitivity
# matches the original mapping.)
ggplot() +
  geom_line(aes(accs$k, accs$accuracy, color = "blue")) +
  ggtitle("Best Value of K") +
  xlab("Value of k") + ylab("Percentage") +
  geom_line(aes(accs$k, accs$sensitivity, color = "red")) +
  geom_line(aes(accs$k, accs$specificity, color = "orange")) +
  scale_color_discrete(name = "Y series", labels = c("Accuracy", "Specificity", "Sensitivity")) +
  theme_clean()

# Final KNN model with the chosen K = 5.
classification = knn(scaled_train, scaled_test, train$cl, k = 5)
# Bug fix: the original `levels(test) = c("Attrition", "No Attrition")` only
# set a meaningless "levels" attribute on a data.frame (a no-op) — and
# relabeling would have broken `positive = "1"` below — so it is removed,
# along with the trailing empty argument in the confusionMatrix() call.
cm = confusionMatrix(table(classification, test$cl), positive = "1")

cm
## Confusion Matrix and Statistics
## 
##               
## classification   0   1
##              0 221  31
##              1   4   5
##                                           
##                Accuracy : 0.8659          
##                  95% CI : (0.8185, 0.9048)
##     No Information Rate : 0.8621          
##     P-Value [Acc > NIR] : 0.4728          
##                                           
##                   Kappa : 0.1768          
##                                           
##  Mcnemar's Test P-Value : 1.109e-05       
##                                           
##             Sensitivity : 0.13889         
##             Specificity : 0.98222         
##          Pos Pred Value : 0.55556         
##          Neg Pred Value : 0.87698         
##              Prevalence : 0.13793         
##          Detection Rate : 0.01916         
##    Detection Prevalence : 0.03448         
##       Balanced Accuracy : 0.56056         
##                                           
##        'Positive' Class : 1               
## 
cm$table
##               
## classification   0   1
##              0 221  31
##              1   4   5

After running the model for 50 K iterations the best K was 5. Unfortunately this K provided a Sensitivity of 13% and a Specificity of 98%. The Sensitivity is well below our minimum requirement of 60%, so we reject this model and move on to a logistic regression.

# Build a caret confusion matrix from logistic-regression probabilities.
#
# predictions: predicted probabilities from predict(..., type = "response").
# reference:   true class labels (0/1).
# threshold:   probability cut-off for predicting class "1". Defaults to
#              0.2, matching the value the original hard-coded.
#
# Returns the caret::confusionMatrix object with "1" as the positive class.
# The original also built a TRUE/FALSE table and computed an accuracy that
# was immediately discarded; that dead code is removed.
cm_logit = function(predictions, reference, threshold = 0.2){
  # Threshold the probabilities into hard 0/1 calls.
  classification <- ifelse(predictions > threshold, 1, 0)

  # Construct a confusion matrix
  new_logit_conf_matrix <- table(classification, reference)
  confusionMatrix(new_logit_conf_matrix, positive = "1")
}

Logistic Regression

# Attach the class label to the scaled train/test frames. cbind() names the
# new column "train$cl" / "test$cl", so rename it to `cl`.
scaled_train_log_r = cbind(scaled_train, train$cl)

scaled_train_log_r <- scaled_train_log_r %>% rename(cl = "train$cl")

scaled_test_log_r = cbind(scaled_test, test$cl)
scaled_test_log_r <- scaled_test_log_r %>% rename(cl = "test$cl")

# Logistic regression of attrition on all 51 features. Fixes vs original:
# the double assignment `log_r = glm.fit <- glm(...)` also bound the model
# to `glm.fit`, masking stats::glm.fit — a single assignment is used; and
# the response is the `cl` column itself (`cl ~ .` excludes cl from the
# RHS, so the fitted model is identical to the original `train$cl ~ .`).
log_r <- glm(cl ~ ., data = scaled_train_log_r, family = "binomial")
summary(log_r)
## 
## Call:
## glm(formula = train$cl ~ ., family = "binomial", data = scaled_train_log_r)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.70729  -0.46315  -0.21932  -0.06207   3.00945  
## 
## Coefficients: (7 not defined because of singularities)
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                          9.210e+00  2.362e+00   3.899 9.66e-05 ***
## Age                                 -2.837e-01  2.112e-01  -1.343 0.179145    
## DailyRate                           -5.408e-02  1.511e-01  -0.358 0.720441    
## DistanceFromHome                     4.438e-01  1.466e-01   3.028 0.002463 ** 
## HourlyRate                           1.454e-01  1.528e-01   0.952 0.341325    
## NumCompaniesWorked                   6.164e-01  1.654e-01   3.726 0.000195 ***
## PercentSalaryHike                    2.358e-04  2.198e-01   0.001 0.999144    
## YearsAtCompany                       2.779e-01  3.812e-01   0.729 0.465997    
## YearsInCurrentRole                  -3.314e-01  2.546e-01  -1.302 0.193086    
## YearsSinceLastPromotion              8.103e-01  2.327e-01   3.482 0.000497 ***
## YearsWithCurrManager                -5.209e-01  2.622e-01  -1.987 0.046972 *  
## TotalWorkingYears                   -7.168e-01  3.621e-01  -1.979 0.047775 *  
## TrainingTimesLastYear               -3.297e-01  1.590e-01  -2.074 0.038095 *  
## MonthlyIncome                        9.613e-01  6.283e-01   1.530 0.126001    
## MonthlyRate                         -1.289e-01  1.509e-01  -0.854 0.393027    
## JobSatisfaction                     -4.648e-01  1.488e-01  -3.124 0.001784 ** 
## EnvironmentSatisfaction             -2.933e-01  1.538e-01  -1.906 0.056612 .  
## JobLevel                            -3.898e-01  5.564e-01  -0.701 0.483598    
## WorkLifeBalance                     -4.711e-01  1.455e-01  -3.238 0.001203 ** 
## JobInvolvement                      -6.395e-01  1.439e-01  -4.445 8.81e-06 ***
## PerformanceRating                    7.923e-02  2.262e-01   0.350 0.726174    
## Education                           -1.455e-01  1.449e-01  -1.004 0.315198    
## RelationshipSatisfaction            -3.380e-01  1.436e-01  -2.354 0.018593 *  
## StockOptionLevel                    -4.243e-01  2.258e-01  -1.879 0.060177 .  
## `BusinessTravel_Non-Travel`         -2.644e-01  1.680e-01  -1.574 0.115529    
## BusinessTravel_Travel_Frequently     2.921e-01  1.499e-01   1.949 0.051276 .  
## BusinessTravel_Travel_Rarely                NA         NA      NA       NA    
## `Department_Human Resources`        -2.604e+00  1.277e+02  -0.020 0.983732    
## `Department_Research & Development`  2.303e-01  7.002e-01   0.329 0.742216    
## Department_Sales                            NA         NA      NA       NA    
## `EducationField_Human Resources`     1.488e-01  1.736e-01   0.857 0.391419    
## `EducationField_Life Sciences`      -7.957e-02  2.848e-01  -0.279 0.779961    
## EducationField_Marketing             1.255e-02  2.339e-01   0.054 0.957226    
## EducationField_Medical              -6.753e-02  2.719e-01  -0.248 0.803875    
## EducationField_Other                -6.523e-02  1.867e-01  -0.349 0.726729    
## `EducationField_Technical Degree`           NA         NA      NA       NA    
## Gender_Female                       -1.339e-01  1.452e-01  -0.922 0.356470    
## Gender_Male                                 NA         NA      NA       NA    
## `JobRole_Healthcare Representative` -8.413e-01  4.850e-01  -1.735 0.082822 .  
## `JobRole_Human Resources`            2.230e+00  1.170e+02   0.019 0.984794    
## `JobRole_Laboratory Technician`     -5.310e-01  6.079e-01  -0.873 0.382437    
## JobRole_Manager                     -4.425e-01  3.677e-01  -1.203 0.228845    
## `JobRole_Manufacturing Director`    -1.305e+00  5.689e-01  -2.294 0.021772 *  
## `JobRole_Research Director`         -9.402e-01  4.782e-01  -1.966 0.049273 *  
## `JobRole_Research Scientist`        -8.746e-01  6.214e-01  -1.408 0.159268    
## `JobRole_Sales Executive`           -6.656e-01  2.852e-01  -2.334 0.019618 *  
## `JobRole_Sales Representative`              NA         NA      NA       NA    
## MaritalStatus_Divorced              -4.499e-01  2.445e-01  -1.840 0.065822 .  
## MaritalStatus_Married               -1.492e-01  1.968e-01  -0.758 0.448406    
## MaritalStatus_Single                        NA         NA      NA       NA    
## OverTime_No                         -9.590e-01  1.415e-01  -6.776 1.23e-11 ***
## OverTime_Yes                                NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 556.76  on 608  degrees of freedom
## Residual deviance: 342.82  on 564  degrees of freedom
## AIC: 432.82
## 
## Number of Fisher Scoring iterations: 14
# In-sample predicted probabilities and their distribution.
training_prediction <- predict(log_r, newdata = scaled_train_log_r, type = "response") 
hist(training_prediction)

# Training-set confusion matrix at the 0.2 probability threshold.
cm_logit(training_prediction, scaled_train_log_r$cl)
## Confusion Matrix and Statistics
## 
##               reference
## classification   0   1
##              0 418  25
##              1  87  79
##                                          
##                Accuracy : 0.8161         
##                  95% CI : (0.783, 0.8461)
##     No Information Rate : 0.8292         
##     P-Value [Acc > NIR] : 0.8205         
##                                          
##                   Kappa : 0.4749         
##                                          
##  Mcnemar's Test P-Value : 8.216e-09      
##                                          
##             Sensitivity : 0.7596         
##             Specificity : 0.8277         
##          Pos Pred Value : 0.4759         
##          Neg Pred Value : 0.9436         
##              Prevalence : 0.1708         
##          Detection Rate : 0.1297         
##    Detection Prevalence : 0.2726         
##       Balanced Accuracy : 0.7937         
##                                          
##        'Positive' Class : 1              
## 
# Held-out test-set probabilities and their distribution.
testing_prediction <- predict(log_r, newdata = scaled_test_log_r, type = "response")
hist(testing_prediction)

# Test-set confusion matrix at the 0.2 probability threshold.
cm_logit(testing_prediction, scaled_test_log_r$cl)
## Confusion Matrix and Statistics
## 
##               reference
## classification   0   1
##              0 193   9
##              1  32  27
##                                          
##                Accuracy : 0.8429         
##                  95% CI : (0.793, 0.8849)
##     No Information Rate : 0.8621         
##     P-Value [Acc > NIR] : 0.8385424      
##                                          
##                   Kappa : 0.4792         
##                                          
##  Mcnemar's Test P-Value : 0.0005908      
##                                          
##             Sensitivity : 0.7500         
##             Specificity : 0.8578         
##          Pos Pred Value : 0.4576         
##          Neg Pred Value : 0.9554         
##              Prevalence : 0.1379         
##          Detection Rate : 0.1034         
##    Detection Prevalence : 0.2261         
##       Balanced Accuracy : 0.8039         
##                                          
##        'Positive' Class : 1              
## 

The logistic regression, with a threshold of 20% (for attrition vs no attrition), gives us a Sensitivity of 75% and a Specificity of 85% on the testing data. These numbers are well above the 60% minimum requirement.

Validation data

#Export validation data

# Held-out competition set (no Attrition labels provided).
val_data = read.csv("/Users/dgrijalva/SMU/Classes/Term Fall 2020/Doing-Data-Science/Units/project2/Data/CaseStudy2CompSet No Attrition.csv")

# One-hot encode the nominal features exactly as done for training.
nominal_categories_val = val_data %>% select(all_of(cat_nominal_variables))
one_encode_data_val = one_hot(as.data.table(nominal_categories_val))

# Column 1 is ID; columns 2:52 are the 51 model features.
predict_data = val_data %>% select(ID, all_of(num_variables), all_of(cat_nominal_features_overall))
predict_data = cbind(predict_data, one_encode_data_val)

# Scale with the TRAIN-fitted preprocessor, then score with the logit model.
scaled_validation = predict(pp, predict_data[, 2:52])
val_prediction <- predict(log_r, newdata = scaled_validation, type = "response")
# Bug fix: the original plotted `testing_prediction` here instead of the
# validation-set predictions.
hist(val_prediction)

# Apply the 0.2 threshold and export ID + predicted Attrition.
pred = ifelse(val_prediction > 0.2, "Yes", "No")
val_prediction_df = data.frame(ID = val_data$ID, Attrition = pred)
write.csv(val_prediction_df, "../Data/Case2Predictions_Grijalva_Attrition.csv")

Multiple linear regression - Predicting monthly income

Now let’s move on to building a model that can predict monthly income. For this we will use the original data set, not the preprocessed one used for the KNN model. The reason for this is that in R, the lm function encodes the data automatically, so there is no need for us to provide any type of encoding. For the multiple linear regression model we will also use forward and backward stepwise selection in order to select the useful features. For this model, the success metric is an RMSE lower than 3,000.

# Regression frame: numeric + ordinal features plus the one-hot nominals.
# all_of() makes the external-vector selection explicit (tidyselect).
reg_data <- data %>% select(all_of(num_variables), all_of(cat_nominal_features_overall))

reg_data <- cbind(reg_data, one_encode_data)
# Full model on every feature, then both-direction stepwise selection by AIC.
lm_fit <- lm(MonthlyIncome ~ ., reg_data)
step <- stepAIC(lm_fit, direction = "both", trace = FALSE)
summary(step)
## 
## Call:
## lm(formula = MonthlyIncome ~ DailyRate + DistanceFromHome + PercentSalaryHike + 
##     YearsSinceLastPromotion + YearsWithCurrManager + TotalWorkingYears + 
##     MonthlyRate + JobLevel + PerformanceRating + `BusinessTravel_Non-Travel` + 
##     BusinessTravel_Travel_Frequently + `Department_Research & Development` + 
##     Gender_Female + `JobRole_Laboratory Technician` + JobRole_Manager + 
##     `JobRole_Research Director` + `JobRole_Research Scientist` + 
##     `JobRole_Sales Executive`, data = reg_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3792.4  -671.9   -13.8   627.9  4129.9 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          3.128e+02  3.929e+02   0.796 0.426120    
## DailyRate                            1.511e-01  8.934e-02   1.692 0.091107 .  
## DistanceFromHome                    -6.485e+00  4.394e+00  -1.476 0.140348    
## PercentSalaryHike                    2.356e+01  1.550e+01   1.519 0.129080    
## YearsSinceLastPromotion              2.781e+01  1.363e+01   2.040 0.041649 *  
## YearsWithCurrManager                -2.657e+01  1.229e+01  -2.162 0.030905 *  
## TotalWorkingYears                    4.864e+01  8.429e+00   5.770 1.11e-08 ***
## MonthlyRate                         -9.208e-03  5.030e-03  -1.831 0.067501 .  
## JobLevel                             2.796e+03  7.921e+01  35.293  < 2e-16 ***
## PerformanceRating                   -3.015e+02  1.586e+02  -1.901 0.057585 .  
## `BusinessTravel_Non-Travel`         -4.020e+02  1.167e+02  -3.446 0.000597 ***
## BusinessTravel_Travel_Frequently    -1.768e+02  9.412e+01  -1.879 0.060635 .  
## `Department_Research & Development`  4.886e+02  1.455e+02   3.358 0.000820 ***
## Gender_Female                       -1.130e+02  7.290e+01  -1.550 0.121616    
## `JobRole_Laboratory Technician`     -6.729e+02  1.366e+02  -4.928 1.00e-06 ***
## JobRole_Manager                      4.194e+03  2.318e+02  18.096  < 2e-16 ***
## `JobRole_Research Director`          3.935e+03  1.942e+02  20.261  < 2e-16 ***
## `JobRole_Research Scientist`        -4.288e+02  1.365e+02  -3.141 0.001743 ** 
## `JobRole_Sales Executive`            3.604e+02  1.488e+02   2.423 0.015620 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1045 on 851 degrees of freedom
## Multiple R-squared:  0.9494, Adjusted R-squared:  0.9483 
## F-statistic: 886.4 on 18 and 851 DF,  p-value: < 2.2e-16

The model has an adjusted R squared of 94%, which means that 94% of the variability in monthly income is explained by the relationship between monthly income and the explanatory variables.

step
## 
## Call:
## lm(formula = MonthlyIncome ~ DailyRate + DistanceFromHome + PercentSalaryHike + 
##     YearsSinceLastPromotion + YearsWithCurrManager + TotalWorkingYears + 
##     MonthlyRate + JobLevel + PerformanceRating + `BusinessTravel_Non-Travel` + 
##     BusinessTravel_Travel_Frequently + `Department_Research & Development` + 
##     Gender_Female + `JobRole_Laboratory Technician` + JobRole_Manager + 
##     `JobRole_Research Director` + `JobRole_Research Scientist` + 
##     `JobRole_Sales Executive`, data = reg_data)
## 
## Coefficients:
##                         (Intercept)                            DailyRate  
##                           3.128e+02                            1.511e-01  
##                    DistanceFromHome                    PercentSalaryHike  
##                          -6.485e+00                            2.356e+01  
##             YearsSinceLastPromotion                 YearsWithCurrManager  
##                           2.781e+01                           -2.657e+01  
##                   TotalWorkingYears                          MonthlyRate  
##                           4.864e+01                           -9.208e-03  
##                            JobLevel                    PerformanceRating  
##                           2.796e+03                           -3.015e+02  
##         `BusinessTravel_Non-Travel`     BusinessTravel_Travel_Frequently  
##                          -4.020e+02                           -1.768e+02  
## `Department_Research & Development`                        Gender_Female  
##                           4.886e+02                           -1.130e+02  
##     `JobRole_Laboratory Technician`                      JobRole_Manager  
##                          -6.729e+02                            4.194e+03  
##         `JobRole_Research Director`         `JobRole_Research Scientist`  
##                           3.935e+03                           -4.288e+02  
##           `JobRole_Sales Executive`  
##                           3.604e+02

In the model training phase, the stepwise selection retained 18 features as making a significant contribution to the monthly income prediction. For many of these features there is an overlap with the important features that also helped explain attrition.

# Residual sum of squares -> mean squared error -> RMSE.
# NOTE(review): these residuals come from the FULL model `lm_fit`, not the
# stepwise-selected `step` model discussed above — confirm which is intended.
RSS = c(crossprod(lm_fit$residuals))
MSE = RSS / length(lm_fit$residuals)
RMSE = sqrt(MSE)
RMSE
## [1] 1029.72

The linear regression model has an RMSE way below the acceptable limit of 3,000. Because of this we will accept this version of the model.

Validate

library("readxl")
# Competition set with MonthlyIncome (the regression target) withheld.
val_sal_data = read_excel("/Users/dgrijalva/SMU/Classes/Term Fall 2020/Doing-Data-Science/Units/project2/Data/CaseStudy2CompSet No Salary.xlsx")

# Same numeric features as training, minus MonthlyIncome. The duplicate
# "PercentSalaryHike" entry from the original list is removed.
num_variables_sal_val = c("Age", "DailyRate", "DistanceFromHome", "HourlyRate",
                          "NumCompaniesWorked", "PercentSalaryHike",
                          "YearsAtCompany", "YearsInCurrentRole",
                          "YearsSinceLastPromotion", "YearsWithCurrManager",
                          "TotalWorkingYears", "TrainingTimesLastYear",
                          "MonthlyRate")

# read_excel returns character columns; convert them to factors so that
# one_hot() actually encodes them. stringsAsFactors = TRUE is required on
# R >= 4.0, where as.data.frame no longer factorizes by default.
nominal_categories_val_reg = val_sal_data %>% select(all_of(cat_nominal_variables))
nominal_categories_val_reg = as.data.frame(unclass(nominal_categories_val_reg),
                                           stringsAsFactors = TRUE)

one_encode_data_val_reg = one_hot(as.data.table(nominal_categories_val_reg))

# all_of() silences the tidyselect external-vector message seen in the
# original run of this chunk.
predict_data_reg_val = val_sal_data %>% select(ID, all_of(num_variables_sal_val),
                                               all_of(cat_nominal_features_overall))
predict_data_reg_val = cbind(predict_data_reg_val, one_encode_data_val_reg)

# Score with the stepwise model and export the predictions.
salary_pred = predict(step, predict_data_reg_val, type = "response")
# Bug fix: the original output filename had a doubled ".csv.csv" extension.
write.csv(salary_pred, "../Data/Case2Predictions_Grijalva_Salary.csv")